From an interview question for Quora
In Python, write a class or module with a bunch of functions for manipulating a URI. For this exercise, pretend that the urllib, urllib2, and urlparse modules don't exist. You can use other standard Python modules, such as re, for this. The focus of the class or module you write should be around usage on the web, so you'll want to have things that make it easier to update or append a querystring var, get the scheme for a URI, etc., and you may want to include ways to figure out the domain for a URL (British-site.co.uk, us-site.com, etc.)
We're looking for correctness (you'll probably want to read the relevant RFCs; make sure you handle edge cases), and elegance of your API (does it let you do the things you commonly want to do with URIs in a really straightforward way?), as well as coding style. If you don't know Python already, then this is also an exercise in learning new things quickly and well. Your code should be well-commented and documented and conform to the guidelines in the PEP 8 Style Guide for Python Code. Include some instructions and examples of usage in your documentation. You may also want to write unit tests.
This is my attempt at this interview question for Quora. To be honest, I'm not sure how much time you would be given to implement this, but I spent much more than would probably be allotted in an interview scenario. The basic idea of using the regex from the RFC for setting / getting the various parts of the URI could feasibly be done fairly quickly. However, for making the API nice and handling the various edge cases, docstrings, tests etc., I think you definitely need several hours (I have spent around 5 hours so far), and I still feel like I'm rushing. To get this code production-ready, with good documentation and a good set of tests covering all kinds of edge cases, I would be looking at spending days of work rather than hours.
In [19]:
import re
# TODO :
# - username setter
# - password setter
# - host setter
# - port setter
# - special case for file (no authority)
# - more edge case tests
# - dunder methods operator overloading
# - would be nice to have validation on __init__ and after setters
# - add caching so that the results of the various property getters are
# stored, and invalidated when set
class Uri(object):
    """
    Uniform resource identifier identifying an abstract or physical
    resource.

    The uri is stored as a single string (``uri_string``). Every property
    splits that string on demand with the regular expression given in
    RFC 3986 appendix B, and every setter recomposes the string following
    RFC 3986 section 5.3, so the string is always the single source of
    truth.

    See https://tools.ietf.org/html/rfc3986

    Example usage:

    >>> uri = Uri('http://user:pw@www.website.com:8080/path?a=1#top')
    >>> uri.scheme
    'http'
    >>> uri.host
    'www.website.com'
    >>> uri.port
    8080
    >>> uri.path = 'index.html'
    >>> uri.set_query_params(b='x y')
    >>> str(uri)
    'http://user:pw@www.website.com:8080/index.html?a=1;b=x%20y#top'
    """

    # Protected Class Members...

    # Component splitting expression from RFC 3986 appendix B. DOTALL
    # makes the final group swallow newlines too, so a match always
    # covers the complete string and nothing is lost when recomposing.
    _regex = re.compile(
        r'^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?',
        re.DOTALL
    )

    # Matching group numbers of the five uri components in _regex.
    _SCHEME, _AUTHORITY, _PATH, _QUERY, _FRAGMENT = 2, 4, 5, 7, 9
    _PARTS = (2, 4, 5, 7, 9)

    # Byte values which never need percent-encoding (RFC 3986 section 2.3).
    _UNRESERVED = frozenset(bytearray(
        b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
        b'abcdefghijklmnopqrstuvwxyz'
        b'0123456789-._~'
    ))
    _HEX_DIGITS = frozenset(bytearray(b'0123456789ABCDEFabcdef'))
    _PERCENT = 0x25

    # Dunder Methods...

    def __init__(self, uri_string):
        """
        Construct a new uri.

        :param uri_string: The string representation of the uri.
        :type uri_string: str
        """
        self.uri_string = uri_string

    def __str__(self):
        """
        Get the readable string representation of this uri.

        :return: String representation of this uri.
        :rtype: str
        """
        return self.uri_string

    def __repr__(self):
        """
        Get an unambiguous representation of this uri, the uri string
        is quoted so the result reads like the constructor call.

        :return: Representation of this uri, e.g. Uri('http://a.com').
        :rtype: str
        """
        return 'Uri(%r)' % (self.uri_string,)

    # Public Properties...

    @property
    def scheme(self):
        """
        Get the uri scheme, this is the first part of the uri, for
        example http, ftp, file.

        :return: The uri scheme, or '' if there is none.
        :rtype: str
        """
        return self._get_uri_part(self._SCHEME)

    @scheme.setter
    def scheme(self, value):
        """
        Set the uri scheme, this is the first part of the uri, for
        example http, ftp, file.

        :param value: The uri scheme to set, '' removes the scheme.
        :type value: str
        """
        self._set_uri_part(self._SCHEME, value)

    @property
    def authority(self):
        """
        Get the authority, this is the [username[:password]@]host[:port].

        :return: The authority, or '' if there is none.
        :rtype: str
        """
        return self._get_uri_part(self._AUTHORITY)

    @authority.setter
    def authority(self, value):
        """
        Set the authority, this is the [username[:password]@]host[:port].

        :param value: The authority to set. '' leaves a uri without an
                      authority untouched and empties an existing one
                      (the '//' is kept), None removes it entirely.
        :type value: str
        """
        self._set_uri_part(self._AUTHORITY, value)

    @property
    def username(self):
        """
        Get the username part of the userinfo.

        :return: The username, or '' if there is none.
        :rtype: str
        """
        return self.userinfo.partition(':')[0]

    @username.setter
    def username(self, value):
        """
        Set the username part of the userinfo, any password is kept.

        :param value: The username to set.
        :type value: str
        """
        password = self.password
        self.userinfo = (value + ':' + password) if password else value

    @property
    def password(self):
        """
        Get the password part of the userinfo, this is everything after
        the first ':' so a password may itself contain colons.

        :return: The password, or '' if there is none.
        :rtype: str
        """
        return self.userinfo.partition(':')[2]

    @password.setter
    def password(self, value):
        """
        Set the password part of the userinfo, the username is kept.

        :param value: The password to set, '' removes the password.
        :type value: str
        """
        username = self.username
        self.userinfo = (username + ':' + value) if value else username

    @property
    def userinfo(self):
        """
        Get the userinfo in the form username:password (password is
        optional).

        :return: The userinfo, or '' if there is none.
        :rtype: str
        """
        # The last '@' is the delimiter, anything before it is userinfo.
        return self.authority.rpartition('@')[0]

    @userinfo.setter
    def userinfo(self, value):
        """
        Set the userinfo in the form username:password (password is
        optional), the host and port are kept.

        :param value: The userinfo to set, '' removes the userinfo.
        :type value: str
        """
        host_and_port = self.host_and_port
        if value:
            self.authority = value + '@' + host_and_port
        else:
            self.authority = host_and_port

    @property
    def host_and_port(self):
        """
        Get the host and possibly port part of the authority.

        :return: The host (and port) part of the authority.
        :rtype: str
        """
        return self.authority.rpartition('@')[2]

    @host_and_port.setter
    def host_and_port(self, value):
        """
        Set the host and possibly port part of the authority, the
        userinfo is kept.

        :param value: The host (and port) to set.
        :type value: str
        """
        userinfo = self.userinfo
        self.authority = (userinfo + '@' + value) if userinfo else value

    @property
    def host(self):
        """
        Get the host part of the authority. An IPv6 literal is returned
        including its square brackets, e.g. '[::1]'.

        :return: The host part of the authority.
        :rtype: str
        """
        return self._split_host_and_port()[0]

    @host.setter
    def host(self, value):
        """
        Set the host part of the authority, userinfo and port are kept.

        :param value: The host to set.
        :type value: str
        """
        port = self._split_host_and_port()[1]
        self.host_and_port = (value + ':' + port) if port else value

    @property
    def port(self):
        """
        Get the port part of the authority.

        :return: The port part of the authority.
        :rtype: int or None if no (or an empty) port is specified
        :raises ValueError: If the port in the uri is not a number.
        """
        port = self._split_host_and_port()[1]
        return int(port) if port else None

    @port.setter
    def port(self, value):
        """
        Set the port part of the authority, userinfo and host are kept.

        :param value: The port to set, None or '' removes the port.
        :type value: int or str
        :raises ValueError: If the value is not a non-negative integer.
        """
        if value is None or value == '':
            port = ''
        else:
            port = str(value)
            if not port.isdigit():
                raise ValueError(
                    'port must be a non-negative integer: %r' % (value,)
                )
        host = self._split_host_and_port()[0]
        self.host_and_port = (host + ':' + port) if port else host

    @property
    def path(self):
        """
        Get the path of the resource.

        :return: The path, or '' if it is empty.
        :rtype: str
        """
        return self._get_uri_part(self._PATH)

    @path.setter
    def path(self, value):
        """
        Set the path of the resource. When the uri has an authority a
        missing leading '/' is added, because such a path has to be
        empty or absolute; without an authority (mailto:, relative
        references) the path is stored as given.

        :param value: The path to set, '' empties the path.
        :type value: str
        """
        self._set_uri_part(self._PATH, value)

    @property
    def query_string(self):
        """
        Get the whole query string.
        Note it is possible to get the query parameters as a dict
        using the query_params property.

        :return: The query string (without the '?'), or '' if none.
        :rtype: str
        """
        return self._get_uri_part(self._QUERY)

    @query_string.setter
    def query_string(self, value):
        """
        Set the whole query string.
        Note it is possible to set individual query parameters using the
        method set_query_params.

        :param value: The query string, '' removes the query.
        :type value: str
        """
        self._set_uri_part(self._QUERY, value)

    @property
    def query_params(self):
        """
        Get the decoded query parameters of the uri as a dictionary,
        for example:

        >>> uri = Uri('http://www.web.com?param1=value1;param2=a%20b')
        >>> uri.query_params == {'param1': 'value1', 'param2': 'a b'}
        True

        Parameters may be separated by ';' or '&'. A parameter is split
        at its first '=' only, a parameter without '=' maps to ''. If a
        name is repeated the last value wins.

        :return: The query parameters.
        :rtype: dict
        """
        result = {}
        for segment in self._query_segments():
            name, _, value = segment.partition('=')
            result[self.decode(name)] = self.decode(value)
        return result

    def set_query_params(self, **query_params):
        """
        Set query parameters of the uri, the names and values are
        encoded to ensure a valid uri results even when setting non
        valid uri characters, for example:

        >>> url = Uri('http://www.web.com?param1=value1')
        >>> url.set_query_params(param2='val/ue2', param3='value3')
        >>> str(url)
        'http://www.web.com?param1=value1;param2=val%2Fue2;param3=value3'

        It is possible to perform a partial update of the parameters:
        parameters which are not named keep their position and their
        exact original text, named ones are replaced in place and new
        ones are appended. The existing separator is reused when the
        query already uses '&', otherwise ';' is used.

        :param **query_params: named parameters to set.
        """
        separator = '&' if '&' in self.query_string else ';'
        segments = []
        written = set()
        for segment in self._query_segments():
            name = self.decode(segment.partition('=')[0])
            if name not in query_params:
                # Untouched parameters are kept verbatim so that their
                # original encoding is never altered.
                segments.append(segment)
            elif name not in written:
                segments.append(self._encode_pair(name, query_params[name]))
                written.add(name)
        for name, value in query_params.items():
            if name not in written:
                segments.append(self._encode_pair(name, value))
        self.query_string = separator.join(segments)

    @property
    def fragment(self):
        """
        Get the fragment part of the uri.

        :return: The fragment (without the '#'), or '' if none.
        :rtype: str
        """
        return self._get_uri_part(self._FRAGMENT)

    @fragment.setter
    def fragment(self, value):
        """
        Set the fragment part of the uri.

        :param value: The fragment, '' removes the fragment.
        :type value: str
        """
        self._set_uri_part(self._FRAGMENT, value)

    # Public Class Methods...

    @classmethod
    def encode(cls, input_string):
        """
        Percent-encode a string so that it is safe to be used as part
        of a uri. Text is encoded as UTF-8 first; only the unreserved
        characters (letters, digits, '-', '.', '_' and '~') are left as
        they are.

        :param input_string: The string to encode.
        :type input_string: str
        :return: The encoded string.
        :rtype: str
        """
        if isinstance(input_string, bytes):
            raw = input_string
        else:
            raw = input_string.encode('utf-8')
        return ''.join(
            chr(byte) if byte in cls._UNRESERVED else '%%%02X' % byte
            for byte in bytearray(raw)
        )

    @classmethod
    def decode(cls, input_string):
        """
        Decode a percent-encoded string. A '%' which is not followed by
        two hexadecimal digits is kept literally instead of failing.

        :param input_string: The encoded string to decode.
        :type input_string: str
        :return: The decoded string.
        :rtype: str
        """
        if isinstance(input_string, bytes):
            raw = bytearray(input_string)
        else:
            raw = bytearray(input_string.encode('utf-8'))
        decoded = bytearray()
        position = 0
        length = len(raw)
        while position < length:
            byte = raw[position]
            digits = raw[position + 1:position + 3]
            if (byte == cls._PERCENT and len(digits) == 2
                    and all(d in cls._HEX_DIGITS for d in digits)):
                decoded.append(int(digits.decode('ascii'), 16))
                position += 3
            else:
                decoded.append(byte)
                position += 1
        if str is bytes and isinstance(input_string, str):
            # Python 2 byte string in, byte string out.
            return str(decoded)
        return decoded.decode('utf-8', 'replace')

    # Protected Instance Methods...

    def _split(self):
        """
        Split the uri string into its five components.

        :return: Mapping of matching group number to component text,
                 None marks a component which is absent altogether.
        :rtype: dict
        """
        match = self._regex.match(self.uri_string)
        return dict((part, match.group(part)) for part in self._PARTS)

    def _split_host_and_port(self):
        """
        Split host_and_port into the host and the port text.

        :return: Tuple (host, port), port is '' when not present.
        :rtype: tuple
        """
        host_and_port = self.host_and_port
        if host_and_port.startswith('['):
            # IP literal: the colons inside the brackets are part of
            # the host, only a colon after ']' introduces the port.
            end = host_and_port.find(']')
            if end != -1:
                rest = host_and_port[end + 1:]
                port = rest[1:] if rest.startswith(':') else ''
                return host_and_port[:end + 1], port
        host, _, port = host_and_port.partition(':')
        return host, port

    def _query_segments(self):
        """
        Split the query string into its raw name=value segments.

        :return: The non empty segments, still percent-encoded.
        :rtype: list
        """
        normalised = self.query_string.replace('&', ';')
        return [segment for segment in normalised.split(';') if segment]

    def _encode_pair(self, name, value):
        """
        Build one encoded name=value query segment.

        :param name: The parameter name.
        :type name: str
        :param value: The parameter value, non strings are converted
                      with str() first.
        :return: The encoded segment.
        :rtype: str
        """
        if not isinstance(value, (bytes, type(u''))):
            value = str(value)
        return self.encode(name) + '=' + self.encode(value)

    def _get_uri_part(self, index):
        """
        Use the regular expression from the uri rfc for retrieving
        the particular section from the uri.

        :param index: The index of the matching group to retrieve, see
                      the uri rfc for information as to which matching
                      group relate to the various parts of the uri.
        :type index: int or str repr of an int
        :return: The sub part of the uri relating to the requested
                 matching group, or '' if this part is not present.
        :rtype: str
        """
        value = self._regex.match(self.uri_string).group(int(index))
        return '' if value is None else value

    def _set_uri_part(self, index, value):
        """
        Replace one component of the uri string with the value, leaving
        the rest of the uri intact. The string is rebuilt by plain
        concatenation (RFC 3986 section 5.3) so the value is always
        taken literally, and the delimiters ':', '//', '?' and '#' are
        only written for components which are present.

        :param index: The index of the matching group to replace, see
                      the uri rfc for information as to which matching
                      group relate to the various parts of the uri.
        :type index: int or str repr of an int
        :param value: The new value to set. None removes the component;
                      '' does the same for scheme, query and fragment.
        :type value: str
        :raises ValueError: If index is not one of the component groups.
        """
        index = int(index)
        if index not in self._PARTS:
            raise ValueError('unknown uri part index: %r' % (index,))
        parts = self._split()
        if index == self._PATH:
            # The path is always present, it can only be empty.
            parts[index] = '' if value is None else value
        elif index == self._AUTHORITY:
            if value == '' and parts[index] is None:
                # Assigning the '' the getter reports for a missing
                # authority must not make a '//' appear.
                value = None
            parts[index] = value
        else:
            parts[index] = value if value else None

        path = parts[self._PATH]
        if parts[self._AUTHORITY] is not None:
            if path and not path.startswith('/'):
                # Otherwise the path would run into the authority.
                path = '/' + path

        pieces = []
        if parts[self._SCHEME] is not None:
            pieces.extend((parts[self._SCHEME], ':'))
        if parts[self._AUTHORITY] is not None:
            pieces.extend(('//', parts[self._AUTHORITY]))
        pieces.append(path)
        if parts[self._QUERY] is not None:
            pieces.extend(('?', parts[self._QUERY]))
        if parts[self._FRAGMENT] is not None:
            pieces.extend(('#', parts[self._FRAGMENT]))
        self.uri_string = ''.join(pieces)
def uri_test(uri_string):
    """
    Parse a uri string and dump every component it contains.

    :param uri_string: The string representation of the uri to show.
    :type uri_string: str
    """
    print_uri_verbose(Uri(uri_string))
def print_uri_verbose(url):
    """
    Print the uri followed by one labelled line for each component
    which is present.

    :param url: The uri to describe.
    :type url: Uri
    """
    print('\n' + str(url))
    # (label, attribute name) in the order the lines are written.
    rows = (
        ('scheme | ', 'scheme'),
        ('userinfo | ', 'userinfo'),
        ('username | ', 'username'),
        ('password | ', 'password'),
        ('authority | ', 'authority'),
        ('host_and_port | ', 'host_and_port'),
        ('host | ', 'host'),
        ('port | ', 'port'),
        ('path | ', 'path'),
        ('query_string | ', 'query_string'),
        ('query_params | ', 'query_params'),
        ('fragment | ', 'fragment'),
    )
    for label, attribute in rows:
        value = getattr(url, attribute)
        if attribute == 'port':
            # The port is a number or None, it has no length to test.
            present = value is not None
        else:
            present = len(value) > 0
        if present:
            # Single formatted argument: same text as "print a, b".
            print('%s %s' % (label, value))
print '\n', '-- ENCODE TESTS', '-' * 80, '\n'
to_encode = 'this is invalid \\, also % is not valid'
to_decode = Uri.encode(to_encode)
print to_decode
decoded = Uri.decode(to_decode)
print decoded
assert to_encode == decoded
print '\n', '-- GETTER TESTS', '-' * 80
# getter tests
uri_test('http://username:password@www.website.com:8080/path?param=1#fragment')
uri_test('http://username:password@www.website.com/path?param=1#fragment')
uri_test('http://username:password@www.website.com/path?param=1')
uri_test('http://username@www.website.com/path?param=1')
uri_test('http://www.website.com/path?param=1')
uri_test('http://www.website.com/path')
uri_test('http://www.website.com')
uri_test('http://www.website.com')
print '\n', '-- SETTER TESTS', '-' * 80
# setter tests
url = Uri('http://username:password@www.website.com/path')
print "\n", url, "\nurl.scheme = 'ftp'",
url.scheme = 'ftp'
print_uri_verbose(url)
print "\n", url, "\nurl.userinfo = 'me:secret'",
url.userinfo = 'me:secret'
print_uri_verbose(url)
print "\n", url, "\nurl.username = 'mio'",
url.username = 'mio'
print_uri_verbose(url)
print "\n", url, "\nurl.password = 'secreto'",
url.password = 'secreto'
print_uri_verbose(url)
print "\n", url, "\nurl.authority = 'me:secret@www.google.com:80'",
url.authority = 'me:secret@www.google.com:80'
print_uri_verbose(url)
print "\n", url, "\nurl.host_and_port = 'www.google.com:8888'",
url.host_and_port = 'www.google.com:8888'
print_uri_verbose(url)
print "\n", url, "\nurl.host = 'www.google.com'",
url.host = 'www.google.com'
print_uri_verbose(url)
print "\n", url, "\nurl.port = 8090",
url.port = 8090
print_uri_verbose(url)
print "\n", url, "\nurl.path = 'index.html'",
url.path = 'index.html'
print_uri_verbose(url)
print "\n", url, "\nurl.path = 'index.html'",
url.path = '/index.html'
print_uri_verbose(url)
print "\n", url, "\nurl.query_string = 'param1=value1'",
url.query_string = 'param1=value1'
print_uri_verbose(url)
print "\n", url, "\nurl.set_query_params(param1='newvalue1', param2='param2')",
url.set_query_params(param1='newvalue1', param2='param2')
print_uri_verbose(url)
print "\n", url, "\nurl.fragment = 'sub_section'",
url.fragment = 'sub_section'
print_uri_verbose(url)